!conda install hdbscan --yes
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import linkage
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import SpectralClustering
from sklearn.metrics import pairwise_distances_argmin
from sklearn.metrics import completeness_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import homogeneity_score
from sklearn.datasets.samples_generator import make_blobs
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import hdbscan
import yellowbrick
import yellowbrick.datasets
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import SilhouetteVisualizer
from yellowbrick.cluster import InterclusterDistance
# Synthetic benchmark data: 500 points drawn around 5 centres, with the
# generating centre of each point kept in y_blob as ground truth.
X_blob, y_blob = make_blobs(
    n_samples=500,
    centers=5,
    cluster_std=1.5,
    random_state=538,
)

# Scatter of the raw data, coloured by ground-truth label.
plt.figure(figsize=(10, 8))
plt.scatter(X_blob[:, 0], X_blob[:, 1], c=y_blob, s=50, cmap='plasma')

# One row per clustering method; rows are written by score_model().
resultsDF = pd.DataFrame(columns=['Homogeneity', 'Completeness', 'Silhouette'])
def accuracy(X, labels, truth=None):
    """Print and return quality scores for a clustering of X.

    Parameters
    ----------
    X : samples that were clustered.
    labels : cluster label assigned to each sample.
    truth : optional ground-truth label per sample. When omitted, only the
        silhouette score is computed.

    Returns
    -------
    tuple
        ``(homogeneity, completeness, silhouette)``. The first two entries
        are ``None`` when ``truth`` is not given.
    """
    score_silhouette = silhouette_score(X, labels, metric='euclidean')
    print('Silhouette : {:.3f}'.format(score_silhouette))

    score_homogeneity = None
    score_completeness = None
    if truth is not None:
        # Both metrics take (labels_true, labels_pred) and are not symmetric:
        # swapping the arguments swaps homogeneity with completeness.
        score_homogeneity = homogeneity_score(truth, labels)
        score_completeness = completeness_score(truth, labels)
        print('Homogeneity : {:.3f}'.format(score_homogeneity))
        print('Completeness : {:.3f}'.format(score_completeness))

    return score_homogeneity, score_completeness, score_silhouette
def score_model(name, X, labels, truth):
    """Score a clustering with accuracy() and store it in resultsDF.

    The row indexed by ``name`` is created, or overwritten if it already
    exists, with the homogeneity, completeness and silhouette values.
    """
    homogeneity, completeness, silhouette = accuracy(X, labels, truth)
    resultsDF.loc[name] = [homogeneity, completeness, silhouette]
def initialize(X, pick, seed):
    """Return ``pick`` distinct rows of X chosen at random as starting centroids.

    Parameters
    ----------
    X : array indexed by sample along its first axis.
    pick : number of rows to draw (sampling is without replacement).
    seed : value used to seed NumPy's random generator.

    Notes
    -----
    This reseeds NumPy's *global* random state, so the same seed always
    yields the same rows, and later draws from ``np.random`` are affected.
    """
    # np.random.seed() returns None, so there is nothing useful to keep.
    np.random.seed(seed)
    indexes = np.random.choice(X.shape[0], pick, replace=False)
    return X[indexes]
def assignClusters(X, centroids):
    """Return, for every row of X, the index of its closest centroid."""
    nearest = pairwise_distances_argmin(X, centroids)
    return nearest
def computeCentroids(X, labels, clusters):
    """Return the mean of the rows of X for each label 0 .. clusters-1.

    The result has one entry per label, in label order. Labels outside
    ``range(clusters)`` (for example a negative noise label) are ignored.
    """
    means = []
    for cluster_id in range(clusters):
        members = X[labels == cluster_id]
        means.append(members.mean(axis=0))
    return np.array(means)
def clusterData(X, clusters, seed=1337, iterations=50):
    """Cluster X by alternating nearest-centroid assignment and centroid update.

    Parameters
    ----------
    X : 2-D array of samples (one row per sample).
    clusters : number of clusters to form.
    seed : seed forwarded to initialize() for picking the starting centroids.
    iterations : maximum number of centroid updates to perform.

    Returns
    -------
    tuple
        ``(centroids, assignment)`` where ``assignment[i]`` is the index of
        the returned centroid closest to ``X[i]``.
    """
    centroids = initialize(X, clusters, seed)
    assignment = assignClusters(X, centroids)

    # A bounded for-loop makes `iterations` a real upper limit on updates.
    for _ in range(iterations):
        updated = computeCentroids(X, assignment, clusters)

        # A cluster that lost all its points has a NaN mean. NaN never
        # compares equal, so it would block convergence; keep that cluster's
        # previous centroid instead.
        emptied = np.isnan(updated).any(axis=1)
        updated[emptied] = centroids[emptied]

        previous, centroids = centroids, updated
        if np.array_equal(previous, centroids):
            break
        # Re-assign so the returned labels always match the returned centroids.
        assignment = assignClusters(X, centroids)

    return centroids, assignment
# A bare integer default makes interact() build a slider that also covers zero
# and negative values, which are not valid cluster counts. The widget is given
# explicitly so the function signature (default 7) stays as it was.
@interact(EM_CLUSTERS=widgets.IntSlider(value=7, min=2, max=21))
def ExpectationMax(EM_CLUSTERS=7):
    """Cluster X_blob with clusterData(), plot the result and record its scores.

    Points are coloured by assigned cluster and the centroids are drawn in
    gray. Scores are stored in resultsDF under the name 'EM'.
    """
    EM_centroids, EM_labels = clusterData(X_blob, EM_CLUSTERS)
    plt.figure(figsize=(10, 8))
    plt.scatter(X_blob[:, 0], X_blob[:, 1], c=EM_labels, s=50, cmap='plasma')
    plt.scatter(EM_centroids[:, 0], EM_centroids[:, 1], s=100, c='gray')
    score_model('EM', X_blob, EM_labels, y_blob)
# A bare integer default makes interact() build a slider that also covers zero
# and negative values, which KMeans rejects. The widget is given explicitly so
# the function signature (default 7) stays as it was.
@interact(SK_clusters=widgets.IntSlider(value=7, min=2, max=21))
def SKLearnKMeans(SK_clusters=7):
    """Cluster X_blob with scikit-learn's KMeans, plot it and record its scores.

    Points are coloured by assigned cluster and the per-cluster means are
    drawn in gray. Scores are stored in resultsDF under the name 'KMeans'.
    """
    km = KMeans(n_clusters=SK_clusters)
    # The y argument of fit_predict is ignored by KMeans, so it is not passed.
    SK_labels = km.fit_predict(X_blob)
    SK_centroids = computeCentroids(X_blob, SK_labels, SK_clusters)
    plt.figure(figsize=(10, 8))
    plt.scatter(X_blob[:, 0], X_blob[:, 1], c=SK_labels, s=50, cmap='plasma')
    plt.scatter(SK_centroids[:, 0], SK_centroids[:, 1], s=100, c='gray')
    score_model('KMeans', X_blob, SK_labels, y_blob)
@interact
def Elbow(Method=['distortion', 'silhouette', 'calinski_harabaz']):
    """Draw a KElbowVisualizer for KMeans on X_blob using the chosen metric.

    The visualizer is configured with k=(2,12) and asked to locate the elbow.
    """
    plt.figure(figsize=(10, 8))
    elbow_plot = KElbowVisualizer(
        KMeans(),
        k=(2, 12),
        metric=Method,
        locate_elbow=True,
    )
    elbow_plot.fit(X_blob)
# NOTE(review): this definition reuses the name `Elbow` and so rebinds the
# earlier function of the same name, although it draws a silhouette plot.
@interact
def Elbow(Clusters=(2,10)):
    """Draw a SilhouetteVisualizer for KMeans with ``Clusters`` clusters on X_blob."""
    plt.figure(figsize=(10, 8))
    kmeans_model = KMeans(n_clusters=Clusters)
    silhouette_plot = SilhouetteVisualizer(kmeans_model, colors='plasma')
    silhouette_plot.fit(X_blob)
@interact
def Interdistance(Clusters=(2,10)):
    """Draw an InterclusterDistance plot for KMeans with ``Clusters`` clusters on X_blob."""
    plt.figure(figsize=(10, 8))
    kmeans_model = KMeans(n_clusters=Clusters)
    distance_plot = InterclusterDistance(kmeans_model, colors='plasma')
    distance_plot.fit(X_blob)
@interact
def AGGLOMERATIVEInteractive(AGG_clusters=(2,10)):
    """Cluster X_blob with AgglomerativeClustering, plot it and record its scores.

    Points are coloured by assigned cluster and the per-cluster means are
    drawn in gray. Scores are stored in resultsDF under 'Agglomerative'.
    """
    found = AgglomerativeClustering(n_clusters=AGG_clusters).fit_predict(X_blob, y_blob)
    centers = computeCentroids(X_blob, found, AGG_clusters)

    plt.figure(figsize=(10, 8))
    xs, ys = X_blob[:, 0], X_blob[:, 1]
    plt.scatter(xs, ys, c=found, s=50, cmap='plasma')
    plt.scatter(centers[:, 0], centers[:, 1], s=100, c='gray')

    score_model('Agglomerative', X_blob, found, y_blob)
# Hierarchical linkage of the blob data, using SciPy's default settings.
Z = linkage(X_blob)

# Dendrogram truncated to the top 10 levels of the tree.
plt.figure(figsize=(10, 8))
out = dendrogram(Z, truncate_mode='level', p=10)

# The linkage matrix as a labelled table: one row per merge step.
ZMatrix = pd.DataFrame(
    Z,
    columns=['Cluster-1', 'Cluster-2', 'Distance', 'NewSize'],
)
ZMatrix
@interact
def SPECTRALInteractive(CLUSTERS=(2,10)):
    """Cluster X_blob with SpectralClustering, plot it and record its scores.

    Points are coloured by assigned cluster and the per-cluster means are
    drawn in gray. Scores are stored in resultsDF under the name 'Spectral'.
    """
    found = SpectralClustering(n_clusters=CLUSTERS).fit_predict(X_blob)
    distinct = len(set(found))
    centers = computeCentroids(X_blob, found, distinct)

    plt.figure(figsize=(10, 8))
    xs, ys = X_blob[:, 0], X_blob[:, 1]
    plt.scatter(xs, ys, c=found, s=50, cmap='plasma')
    plt.scatter(centers[:, 0], centers[:, 1], s=100, c='gray')

    score_model('Spectral', X_blob, found, y_blob)
@interact
def DBSCANInteractive(EPS=(0, 5.0), MIN_SAMPLES=(1,30)):
    """Cluster X_blob with DBSCAN, plot the result and record its scores.

    Points are coloured by assigned label (noise included) and the mean of
    each real cluster is drawn in gray. The printed cluster count excludes
    noise. Scores are stored in resultsDF under the name 'DBSCAN' when the
    labeling can be scored.
    """
    warnings.filterwarnings('ignore')
    dbscan = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES)
    DBSCAN_labels = dbscan.fit_predict(X_blob)

    # DBSCAN marks noise with -1 and numbers real clusters 0..k-1. Counting
    # -1 as a cluster inflates the total and asks computeCentroids for the
    # mean of a label that does not exist.
    distinct_labels = set(DBSCAN_labels)
    n_clusters = len(distinct_labels - {-1})

    plt.figure(figsize=(10, 8))
    plt.scatter(X_blob[:, 0], X_blob[:, 1], c=DBSCAN_labels, s=50, cmap='plasma')
    if n_clusters > 0:
        DBSCAN_centroids = computeCentroids(X_blob, DBSCAN_labels, n_clusters)
        plt.scatter(DBSCAN_centroids[:, 0], DBSCAN_centroids[:, 1], s=100, c='gray')
    print('Clusters {}'.format(n_clusters))

    # silhouette_score needs between 2 and n_samples - 1 distinct labels;
    # skip scoring for degenerate results (e.g. everything flagged as noise)
    # rather than raising inside the widget.
    if 2 <= len(distinct_labels) < len(X_blob):
        score_model('DBSCAN', X_blob, DBSCAN_labels, y_blob)
    else:
        print('Scores skipped: {} distinct label(s)'.format(len(distinct_labels)))
# `(30)` is just the integer 30, for which interact() builds a slider that
# also covers zero and negative values. The widgets are given explicitly so
# only usable values can be selected; the function signature is unchanged.
@interact(
    MIN_CLUSTER_SIZE=widgets.IntSlider(value=30, min=2, max=90),
    MIN_SAMPLES=widgets.IntSlider(value=30, min=1, max=90),
)
def HDBSCANInteractive(MIN_CLUSTER_SIZE=30, MIN_SAMPLES=30):
    """Cluster X_blob with HDBSCAN, plot the result and record its scores.

    Draws a scatter plot coloured by assigned label (noise included) with the
    mean of each real cluster in gray, followed by the truncated single
    linkage tree. The printed cluster count excludes noise. Scores are stored
    in resultsDF under the name 'HDBSCAN' when the labeling can be scored.
    """
    warnings.filterwarnings('ignore')
    hdb = hdbscan.HDBSCAN(min_cluster_size=MIN_CLUSTER_SIZE,
                          min_samples=MIN_SAMPLES)
    HDBSCAN_labels = hdb.fit_predict(X_blob)

    # Noise is labelled -1 and real clusters 0..k-1. Counting -1 as a cluster
    # inflates the total and asks computeCentroids for the mean of a label
    # that does not exist.
    distinct_labels = set(HDBSCAN_labels)
    n_clusters = len(distinct_labels - {-1})
    print('Clusters {}'.format(n_clusters))

    # silhouette_score needs between 2 and n_samples - 1 distinct labels;
    # skip scoring for degenerate results rather than raising in the widget.
    if 2 <= len(distinct_labels) < len(X_blob):
        score_model('HDBSCAN', X_blob, HDBSCAN_labels, y_blob)
    else:
        print('Scores skipped: {} distinct label(s)'.format(len(distinct_labels)))

    plt.figure(figsize=(10, 8))
    plt.scatter(X_blob[:, 0], X_blob[:, 1], c=HDBSCAN_labels, s=50, cmap='plasma')
    if n_clusters > 0:
        HDBSCAN_centroids = computeCentroids(X_blob, HDBSCAN_labels, n_clusters)
        plt.scatter(HDBSCAN_centroids[:, 0], HDBSCAN_centroids[:, 1], s=100, c='gray')
    plt.show()

    plt.figure(figsize=(10, 8))
    hdb.single_linkage_tree_.plot(cmap='plasma', p=10, truncate_mode='lastp')
@interact
def showResults(SORT=['Completeness', 'Homogeneity', 'Silhouette']):
    """Print resultsDF ordered by the selected score, best (highest) first."""
    ranked = resultsDF.sort_values(by=SORT, ascending=False)
    print(ranked)